# Build the tokens object for the English manifesto data and cache it on disk.
# library() (rather than require()) stops immediately if quanteda is missing.
library(quanteda)

# Keep only the English-language rows. subset() is called directly rather than
# through `%>%`, because this script attaches no package that is guaranteed to
# provide the magrittr pipe.
dat <- subset(
    readRDS("data/muller2021/data_manifestos_classified.rds"),
    language == "english"
)
# Fix the level order of the labels; values outside these three become NA.
dat$class <- factor(dat$class, levels = c("Past", "Present", "Future"))
dat$year <- as.integer(dat$year)

# Document names come from the "id" column; duplicated ids are tolerated.
corp <- corpus(dat, docid_field = "id", unique_docnames = FALSE)
# Drop number-only tokens and URLs while tokenizing.
toks <- tokens(corp, remove_numbers = TRUE, remove_url = TRUE)
saveRDS(toks, "data/tokens_en.rds")

# Build the tokens object for the English test sentences and cache it on disk.
dat_test <- readRDS("data/muller2021/data_sentences_classified_english.rds")
# Fix the level order of the labels; values outside these three become NA.
dat_test$class <- factor(dat_test$class, levels = c("Past", "Present", "Future"))
# Year is taken from the first four characters of `date` (assumes the value
# starts with a four-digit year -- TODO confirm); the date column is then
# dropped.
dat_test$year <- as.integer(stringi::stri_sub(dat_test$date, 1, 4))
dat_test$date <- NULL
# Document names come from the "id" column; duplicated ids are tolerated.
corp_test <- corpus(dat_test, docid_field = "id", unique_docnames = FALSE)
# Use the same tokenizer options as for the manifesto tokens (`toks`) so that
# both token sets are preprocessed identically (numbers and URLs removed).
toks_test <- tokens(corp_test, remove_numbers = TRUE, remove_url = TRUE)
saveRDS(toks_test, "data/tokens_test_en.rds")

